Set working directory

# Ana's WD
# setwd("C:/Users/Ana-Maria's Laptop/Documents/GitHub/infoVisualization")

# Andrei's WD
# setwd("C:/Users/andre/Documents/infoVisualization/")

Imports - libraries and read dataset

#install.packages("png")
#install.packages("gifsky")
#install.packages("gganimate")
#install.packages("gapminder")
#install.packages("ggmap")
#install.packages("esquisse")
#install.packages("ggiraph")
#install.packages("gdtools")
#install.packages("leaflet") # interactive maps
#install.packages("plotly")
#install.packages("tseries")
#install.packages("astsa")
#install.packages("sqldf")





library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.0.5
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.0.5
#library(tidyverse)
library(ggmap)
## Warning: package 'ggmap' was built under R version 4.0.5
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(ggiraph)
## Warning: package 'ggiraph' was built under R version 4.0.5
library(esquisse)
## Warning: package 'esquisse' was built under R version 4.0.5
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.0.5
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
## 
##     wind
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# times series libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tseries)
## Warning: package 'tseries' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(astsa)
library(sqldf)
## Warning: package 'sqldf' was built under R version 4.0.5
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.0.5
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.0.5
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.0.5
# esquisser(df)
earthquakes_csv = "eartquakes_Romania.csv"
df <- read.csv(file = earthquakes_csv)
# convert time column to R timestap
df$time <- gsub('T', ' ', df$time)
df$time <- gsub('Z', '', df$time)
df$time <- as.POSIXct(df$time, format="%Y-%m-%d %H:%M:%OS", tz=Sys.timezone())

#split timestamp into date and time
df$timeYear <- as.Date(df$time)
df$timeDay <- format(df$time, "%H:%M:%OS")

df$year <- format(df$time, "%Y")
df$month <- format(df$time, "%m")

df$yearInt <- as.integer(df$year)
# see data types
str(df)
## 'data.frame':    665 obs. of  27 variables:
##  $ time           : POSIXct, format: "2017-03-08 13:43:13" "2017-02-08 15:08:20" ...
##  $ latitude       : num  45.7 45.5 45.7 45.7 45.9 ...
##  $ longitude      : num  26.5 26.3 26.7 26.5 26.8 ...
##  $ depth          : num  149 127 129 97 90 ...
##  $ mag            : num  4.1 4.7 4.4 5.6 4.1 5.6 4.2 2.9 4.3 4.3 ...
##  $ magType        : chr  "mb" "mb" "mb" "mww" ...
##  $ nst            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ gap            : num  41 19 48 14 NA 10 73 NA NA 30 ...
##  $ dmin           : num  0.259 0.222 0.559 0.466 NA ...
##  $ rms            : num  0.76 1.1 1.23 0.8 1.43 0.96 0.68 1.18 0.63 1.09 ...
##  $ net            : chr  "us" "us" "us" "us" ...
##  $ id             : chr  "us100087pk" "us20008ii5" "us20008ifq" "us10007n3r" ...
##  $ updated        : chr  "2017-04-20T10:53:57.224Z" "2017-03-01T06:16:37.044Z" "2017-03-05T08:24:32.472Z" "2017-03-23T22:52:05.040Z" ...
##  $ place          : chr  "19km N of Gura Teghii, Romania" "11km NNW of Nehoiu, Romania" "4km WNW of Nereju, Romania" "14km W of Nereju, Romania" ...
##  $ type           : chr  "earthquake" "earthquake" "earthquake" "earthquake" ...
##  $ horizontalError: num  3.2 6.6 6.4 4.3 6.7 4 3 4.2 3.8 4.9 ...
##  $ depthError     : num  3.9 5.4 5.4 1.8 6.4 1.8 4.2 8 5 6.6 ...
##  $ magError       : num  0.127 0.045 0.148 NA 0.157 NA 0.062 NA 0.265 0.111 ...
##  $ magNst         : int  17 152 13 NA 11 NA 25 NA 4 23 ...
##  $ status         : chr  "reviewed" "reviewed" "reviewed" "reviewed" ...
##  $ locationSource : chr  "us" "us" "us" "us" ...
##  $ magSource      : chr  "us" "us" "us" "us" ...
##  $ timeYear       : Date, format: "2017-03-08" "2017-02-08" ...
##  $ timeDay        : chr  "13:43:13" "15:08:20" "09:52:06" "23:20:56" ...
##  $ year           : chr  "2017" "2017" "2017" "2016" ...
##  $ month          : chr  "03" "02" "02" "12" ...
##  $ yearInt        : int  2017 2017 2017 2016 2016 2016 2016 2016 2016 2015 ...

Describing the dataset: time - self-explanatory, includes: year, month, day, hour, minute, seconds, milliseconds latitude & longitude - the coordinates of the earthquake source depth - the depth at which it originated mag - the physical size of the earthquake Great 8 or more Major 7 - 7.9 Strong 6 - 6.9 Moderate 5 - 5.9 Light 4 - 4.9 Minor 3 - 3.9 magType - category based on magnitude range and distance range. Details: https://www.usgs.gov/natural-hazards/earthquake-hazards/science/magnitude-types?qt-science_center_objects=0#qt-science_center_objects

We start by exploring the variance in the dataset. This way we can better understand each column, besides its initial definition.

mag_hist <- ggplot(df) + 
  geom_histogram(mapping = aes(x = mag), fill = "steelblue", bins = 40) +
  theme_minimal()
mag_hist + ggtitle("Number of recorded earthquakes by magnitude")

year_bar <- ggplot(df) + 
  geom_bar(mapping = aes(y = year), fill = "steelblue") +
  theme_minimal()
year_bar + ggtitle("Number of earthquakes per year (1990 - 2017)")

month_bar <- ggplot(df) + 
  geom_bar(mapping = aes(x = month), fill = "steelblue") +
  theme_minimal()
month_bar + ggtitle("Number of earthquakes by month (1990 - 2017)")

depth_bar <- ggplot(df) + 
  geom_histogram(mapping = aes(x = depth), fill = "steelblue") +
  theme_minimal()
depth_bar + ggtitle("Number of earthquakes by depth")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

magtype_bar <- ggplot(df) + 
  geom_bar(mapping = aes(x = magType), fill = "steelblue") +
  theme_minimal()
magtype_bar + ggtitle("Magnitude type (category)")

nst_bar <- ggplot(df) + 
  geom_histogram(mapping = aes(x = nst), fill = "steelblue", bins=50) +
  theme_minimal()
nst_bar + ggtitle("Nst - number of seismic stations reporting")
## Warning: Removed 270 rows containing non-finite values (stat_bin).

gap_bar <- ggplot(df) + 
  geom_histogram(mapping = aes(x = gap), fill = "steelblue") +
  theme_minimal()
gap_bar + ggtitle("Gap Value (azimuthal gap range)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 250 rows containing non-finite values (stat_bin).

dmin_bar <- ggplot(df) + 
  geom_histogram(mapping = aes(x = dmin), fill = "steelblue") +
  theme_minimal()
dmin_bar + ggtitle("Dmin Value (distance from epicenter)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 620 rows containing non-finite values (stat_bin).

rms_bar <- ggplot(df) + 
  geom_histogram(mapping = aes(x = rms), fill = "steelblue") +
  theme_minimal()
rms_bar + ggtitle("Rms Value (root-mean-squared residual of solution)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 331 rows containing non-finite values (stat_bin).

net_bar <- ggplot(df) + 
  geom_bar(mapping = aes(x = net), fill = "steelblue") +
  theme_minimal()
net_bar + ggtitle("Net Value")

type_bar <- ggplot(df) + 
  geom_bar(mapping = aes(x = type), fill = "steelblue") +
  theme_minimal()
type_bar + ggtitle("Type of data entry")

status_bar <- ggplot(df) + 
  geom_bar(mapping = aes(x = status), fill = "steelblue") +
  theme_minimal()
status_bar + ggtitle("Status of data entry")

locationSource_bar <- ggplot(df) + 
  geom_bar(mapping = aes(x = locationSource), fill = "steelblue") +
  theme_minimal()
locationSource_bar + ggtitle("Location: data source")

magSourceSource_bar <- ggplot(df) + 
  geom_bar(mapping = aes(x = magSource), fill = "steelblue") +
  theme_minimal()
magSourceSource_bar + ggtitle("Magnitude: data source")

horizontalError_hist <- ggplot(df) + 
  geom_histogram(mapping = aes(x = horizontalError), fill = "steelblue") +
  theme_minimal()
horizontalError_hist + ggtitle("Horizontal Error")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 630 rows containing non-finite values (stat_bin).

depthError_hist <- ggplot(df) + 
  geom_histogram(mapping = aes(x = depthError), fill = "steelblue") +
  theme_minimal()
depthError_hist + ggtitle("Depth Error ")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 447 rows containing non-finite values (stat_bin).

magError_hist <- ggplot(df) + 
  geom_histogram(mapping = aes(x = magError), fill = "steelblue") +
  theme_minimal()
magError_hist + ggtitle("Magnitude Error")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 617 rows containing non-finite values (stat_bin).

magNst_hist <- ggplot(df) + 
  geom_histogram(mapping = aes(x = magNst), fill = "steelblue") +
  theme_minimal()
magNst_hist + ggtitle("Number of stations that reported the magnitude ")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 403 rows containing non-finite values (stat_bin).

We start by exploring the most obvious questions: where do most earthquakes happen, including their magnitude as well

ggplot(data=df) +
  geom_point(mapping = aes(x = longitude, y = latitude, color=mag))

As we can see, there is a big cluster where most earthquakes happen, with a smaller cluster that is noticeable, followed by sparsely distributed cases Let’s now plot them on a map for a better visualization. Based on common knowledge of earthquakes in Romania, can you guess where the main epicenter will be?

(we are using stamen source for the map, but the commented code also uses google maps. It requires an API key, though)

romania_map <- get_map("Brasov", maptype = "roadmap", zoom = 6)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Brasov&zoom=6&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx-e9WaBDXrHnFl2Tud9HT6DdUPuvs
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Brasov&key=xxx-e9WaBDXrHnFl2Tud9HT6DdUPuvs
romania_box <- c(20, 43, 30, 48.5)
romania_map2 <- get_map(location = romania_box, source = "stamen", maptype = "toner", zoom = 7)
## Source : http://tile.stamen.com/terrain/7/71/44.png
## Source : http://tile.stamen.com/terrain/7/72/44.png
## Source : http://tile.stamen.com/terrain/7/73/44.png
## Source : http://tile.stamen.com/terrain/7/74/44.png
## Source : http://tile.stamen.com/terrain/7/71/45.png
## Source : http://tile.stamen.com/terrain/7/72/45.png
## Source : http://tile.stamen.com/terrain/7/73/45.png
## Source : http://tile.stamen.com/terrain/7/74/45.png
## Source : http://tile.stamen.com/terrain/7/71/46.png
## Source : http://tile.stamen.com/terrain/7/72/46.png
## Source : http://tile.stamen.com/terrain/7/73/46.png
## Source : http://tile.stamen.com/terrain/7/74/46.png
## Source : http://tile.stamen.com/terrain/7/71/47.png
## Source : http://tile.stamen.com/terrain/7/72/47.png
## Source : http://tile.stamen.com/terrain/7/73/47.png
## Source : http://tile.stamen.com/terrain/7/74/47.png
#ggmap(romania_map) + 
#  geom_point(data = df, aes(x = longitude, y = latitude), color = "steelblue", size = 1) 

ggmap(romania_map2) + 
  geom_point(data = df, aes(x = longitude, y = latitude, color = mag), size = 1) 

As we can see, most earthquakes are in Vrancea County, as expected.

Let’s also make an interactive map to play around it

We have colored each pin by earthquake classification type (based on magnitude) and have grouped them in clusters (and an extra without clusters just for showcase).

green: minor blue: light pink: moderate orange: strong red: major black: great

As we can see, most of the recorder earthquakes are either minor or light. As we go up in magnitude, the number of earthquakes decreases.

getColor <- function(quakes) {
  sapply(quakes$mag, function(mag) {
  if (mag < 4) {
    "green"
  } else if (mag < 5) {
    "blue"
  } else if (mag < 6) {
    "pink"
  } else if (mag < 7) {
    "orange"
  } else if (mag < 8) {
    "red"
  } else {
    "black"
  } })
}

icons <- awesomeIcons(
  icon = 'ios-close',
  iconColor = 'black',
  library = 'ion',
  markerColor = getColor(df)
)

leaflet(data = df) %>% 
  addTiles() %>%
  addAwesomeMarkers(~longitude, ~latitude, popup = ~as.character(year), label = ~as.character(mag), icon=icons, clusterOptions = markerClusterOptions())
leaflet(data = df) %>% 
  addTiles() %>%
  addAwesomeMarkers(~longitude, ~latitude, popup = ~as.character(year), label = ~as.character(mag), icon=icons)

Let’s start exploring covariance, as well - starting with the number of earthquakes per magnitude, each year. We started by using a barplot and a freqpoly, but it is quite hard to get useful information due to different number of earthquakes in each year. Thus, we switched to density plot which takes that into consideration, as well.

ggplot(data=df) +
  geom_bar(mapping = aes(x = mag, fill = year))
## Warning: position_stack requires non-overlapping x intervals

ggplot(data=df) +
  geom_freqpoly(mapping = aes(x = mag, color = year))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=df) +
  geom_density(mapping = aes(x = mag, color = year))

Due to the number of years, it is still hard to read the graphs. Let’s use a facet_wrap for a better understanding.

See? Way better.

ggplot(data=df) +
  geom_bar(mapping = aes(x = mag, fill = year)) +
  facet_wrap(~ year) +
  theme_minimal()

ggplot(data=df) +
  geom_freqpoly(mapping = aes(x = mag, color = year)) +
  facet_wrap(~ year) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=df) +
  geom_density(mapping = aes(x = mag, color = year)) +
  facet_wrap(~ year) +
  theme_minimal()

There is obviously a correlation between the location and the number of earthquakes. Let’s see if we can find other correlations, as well. How about between the depth and the magnitude

ggplot(data=df) +
  geom_point(mapping = aes(x = depth, y = mag, color = year))

ggplot(data=df) +
  geom_point(mapping = aes(x = depth, y = mag, color = year)) +
  facet_wrap(~ year) +
  theme_minimal()

ggplot(data=df) +
  geom_line(mapping = aes(x = depth, y = mag, color = year)) +
  facet_wrap(~ year) +
  theme_minimal()

Ok, let’s switch to 3D plots as it seems we need to take into account more than 2 columns.

Starting with longitude, latitude and magnitude.

longlatmag <- plot_ly(df, x = ~longitude, y = ~latitude, z = ~mag,
               marker = list(color = ~mag, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE))
longlatmag <- longlatmag %>% add_markers()
longlatmag <- longlatmag %>% layout(scene = list(xaxis = list(title = 'Longitude'),
                                   yaxis = list(title = 'Latitude'),
                                   zaxis = list(title = 'Magnitude')),
                      annotations = list(
                        x = 1.13,
                        y = 1.05,
                        text = 'Magnitude',
                        xref = 'mag',
                        yref = 'mag',
                        showarrow = FALSE
                        ))
longlatmag

Ok, now let’s take a look at the latitude, longitude and depth.

It is interesting to see that while there are 2 obvious clusters regarding the location, one cluster (Vrancea one) definitely has earthquakes with more depth. We have also colored them based on magnitude.

longlatdepth <- plot_ly(df, x = ~longitude, y = ~latitude, z = ~depth,
               marker = list(color = ~mag, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE))
longlatdepth <- longlatdepth %>% add_markers()
longlatdepth <- longlatdepth %>% layout(scene = list(xaxis = list(title = 'Longitude'),
                                   yaxis = list(title = 'Latitude'),
                                   zaxis = list(title = 'Depth')),
                      annotations = list(
                        x = 1.13,
                        y = 1.05,
                        text = 'Magnitude',
                        xref = 'mag',
                        yref = 'mag',
                        showarrow = FALSE
                        ))
longlatdepth

Let’s do some more 3D exploring. How do the errors look? And let’s color them based on magnitude to see if the errors are correlated with the recorder magnitude

longlatdepth <- plot_ly(df, x = ~horizontalError, y = ~depthError, z = ~magError,
               marker = list(color = ~mag, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE))
longlatdepth <- longlatdepth %>% add_markers()
longlatdepth <- longlatdepth %>% layout(scene = list(xaxis = list(title = 'Horizontal Error'),
                                   yaxis = list(title = 'Depth Error'),
                                   zaxis = list(title = 'Magnitude Error')),
                      annotations = list(
                        x = 1.13,
                        y = 1.05,
                        text = 'Magnitude',
                        xref = 'mag',
                        yref = 'mag',
                        showarrow = FALSE
                        ))
longlatdepth
## Warning: Ignoring 635 observations

Ok, now let’s take a look at some animations. (be patient when loading them) Not extremely useful, but super cool to look at!

ggplot(data = df) +
  geom_point(mapping = aes(x = longitude, y = latitude, color = mag)) +
  labs(title = 'Year: {frame_time}', x = 'Longitude', y = 'Latitude') +
  transition_time(yearInt) +
  ease_aes('linear')

ggplot(data = df) +
  geom_bar(mapping = aes(x = mag), fill = 'steelblue') +
  labs(title = 'Year: {frame_time}', x = 'Mag') +
  transition_time(yearInt) +
  ease_aes('linear')

### CHECK FOR STATIONARITY AND MAKE 
### DATA STATIONARY

### Magnitude time series

get_ts_matrix <- function(ts_magnitude) {
  time <- 0
  mag <- 0
  for (index in 1:nrow(ts_magnitude)) {
    time[index] <- ts_magnitude[index,1]
    mag[index] <- ts_magnitude[index,2]
  }
  mag_matrix <- matrix(ts_magnitude$df.mag, 
                       nrow =length(ts_magnitude$df.timeYear),
                       dimnames = list(ts_magnitude$df.timeYear, "data"))
  return(mag_matrix)
}

ts_magnitude <- data.frame(df$timeYear, df$mag)

ts_magnitude_plot <- ggplot(ts_magnitude, aes(x=df.timeYear, y=df.mag)) +
  ggtitle("Magnitude time series") +
  geom_line() + 
  xlab("")

ts_magnitude_plot

mag_matrix <- get_ts_matrix(ts_magnitude)
acf2(ts(mag_matrix))

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## ACF  0.32 0.29 0.32 0.32 0.30 0.29 0.24 0.29 0.26  0.29  0.29  0.29  0.29  0.31
## PACF 0.32 0.21 0.21 0.17 0.13 0.10 0.03 0.09 0.05  0.09  0.08  0.08  0.07  0.08
##      [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
## ACF   0.27  0.28  0.28  0.30  0.23  0.25  0.21  0.19  0.24  0.23  0.24  0.23
## PACF  0.03  0.03  0.04  0.06 -0.03  0.01 -0.04 -0.06  0.03  0.02  0.04  0.00
##      [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
## ACF   0.20  0.22  0.21  0.17   0.2  0.17  0.20  0.21  0.16  0.20
## PACF -0.02 -0.01 -0.02 -0.05   0.0 -0.02  0.03  0.04 -0.02  0.03
### First diff time series
ts_magnitude$df.mag<- c(0, diff(df$mag))

ts_diff_magnitude_plot <- ggplot(ts_magnitude, aes(x=df.timeYear, y=df.mag)) +
  ggtitle("First diff - Magnitude time series") +
  geom_line() + 
  xlab("")

ts_diff_magnitude_plot

diff_mag_matrix <- get_ts_matrix(ts_magnitude)
acf2(ts(diff_mag_matrix))

##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12]
## ACF  -0.48 -0.04  0.02  0.01 -0.01  0.03 -0.07  0.05 -0.04  0.02   0.0  0.00
## PACF -0.48 -0.36 -0.26 -0.19 -0.15 -0.07 -0.13 -0.08 -0.11 -0.10  -0.1 -0.09
##      [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## ACF  -0.01  0.04 -0.03  0.00 -0.01  0.06 -0.06  0.05 -0.02 -0.05  0.04 -0.01
## PACF -0.09 -0.04 -0.04 -0.05 -0.07  0.02 -0.02  0.04  0.05 -0.04 -0.02 -0.05
##      [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
## ACF   0.02  0.00 -0.03  0.02  0.02 -0.05  0.04 -0.04  0.02  0.04 -0.06  0.02
## PACF -0.01  0.01  0.00  0.01  0.04 -0.01  0.02 -0.04 -0.05  0.01 -0.04 -0.04
### Second diff time series

### ts_magnitude$df.mag<- c(0, diff(ts_magnitude$df.mag))

ts_second_diff_magnitude_plot <- ggplot(ts_magnitude, aes(x=df.timeYear, y=df.mag)) +
  ggtitle("Second diff - Magnitude time series") +
  geom_line() + 
  xlab("")

ts_second_diff_magnitude_plot

second_diff_mag_matrix <- get_ts_matrix(ts_magnitude)
acf2(ts(second_diff_mag_matrix))

##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12]
## ACF  -0.48 -0.04  0.02  0.01 -0.01  0.03 -0.07  0.05 -0.04  0.02   0.0  0.00
## PACF -0.48 -0.36 -0.26 -0.19 -0.15 -0.07 -0.13 -0.08 -0.11 -0.10  -0.1 -0.09
##      [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## ACF  -0.01  0.04 -0.03  0.00 -0.01  0.06 -0.06  0.05 -0.02 -0.05  0.04 -0.01
## PACF -0.09 -0.04 -0.04 -0.05 -0.07  0.02 -0.02  0.04  0.05 -0.04 -0.02 -0.05
##      [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36]
## ACF   0.02  0.00 -0.03  0.02  0.02 -0.05  0.04 -0.04  0.02  0.04 -0.06  0.02
## PACF -0.01  0.01  0.00  0.01  0.04 -0.01  0.02 -0.04 -0.05  0.01 -0.04 -0.04
### ACF indicates q
### PACF indicates p
### arima(p,diff,q)

data <- ts(diff_mag_matrix)

ma1<-arima(data, order=c(0,0,1))

my_model <- arima(data, order=c(1,2,2))
stdres <- my_model$residuals/sqrt(my_model$sigma2)
plot(my_model$residuals, main = "Chosen model - ARIMA(1, 3, 2) - Residuals")

qqnorm(stdres, main = "Chosen model - ARIMA(1, 3, 2) - Normal Q-Q Plot of Std Residuals")

shapiro.test(residuals(my_model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(my_model)
## W = 0.98779, p-value = 2.386e-05
year_mag_mean <- aggregate(list("MagMean"=df[, 5]), list("Year"=df$year), mean)
year_df <- data.frame(year_mag_mean$Year, year_mag_mean$MagMean)

time <- 0
mag <- 0
for (index in 1:nrow(year_df)) {
  time[index] <- year_df[index,1]
  mag[index] <- year_df[index,2]
}

data_matrix <- matrix(mag, 
                      nrow =length(time),
                      dimnames = list(time, "data"))
data_ts <- ts(data_matrix, start = 1990)
ts.plot(data_ts, 
        main = "Romania Earthquake - Magnitude mean per year",
        xlab="Year", 
        ylab="Magnitude mean", 
        col="#3c4e66",
        lwd=1)

acf2(data_ts)

##      [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]
## ACF  0.57  0.31  0.13 -0.14 -0.12 -0.20 -0.19
## PACF 0.57 -0.02 -0.06 -0.28  0.13 -0.18  0.04
my_model <- arima(data_ts, order=c(1,1,1))
stdres <- my_model$residuals/sqrt(my_model$sigma2)
plot(my_model$residuals, main = "Chosen model - ARIMA(1, 1, 1) - Residuals")

qqnorm(stdres, main = "Chosen model - ARIMA(1, 1, 1) - Normal Q-Q Plot of Std Residuals")

shapiro.test(residuals(my_model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(my_model)
## W = 0.90685, p-value = 0.01662
my_forecast = predict(arima(data_ts ,order=c(1,1,1)), n.ahead=5)
ts.plot(data_ts, 
        my_forecast$pred,  
        main="Romania Earthquake - Magniture mean per year prediction", 
        col=1:2,
        xlab="Year", 
        ylab="Magnitude mean", 
        lwd=1)